if (interactive() && !str_ends(getwd(), "R/stat_anal/CITY_US")) {
setwd("R/stat_anal/CITY_US")
}
data <- read_excel("CITY_shortname.xls")
data[data == "NA"] <- NA
data[, -(1:2)] <- data.frame(lapply(data[, -(1:2)], as.numeric))
fullnames <- names(read_excel("CITY.xls"))
print(fullnames)
## [1] "CITY City"
## [2] "STATE State"
## [3] "AREA Land area, 1990 (sq.miles)"
## [4] "R_AREA Land area, 1990 (sq.miles), ranks"
## [5] "POP92 Population,1992"
## [6] "R_POP92 Population,1992, ranks"
## [7] "POP80 Population,1980"
## [8] "R_POP80 Population,1980, ranks"
## [9] "POP_CH Population growth rate,1980-1992"
## [10] "R_POP_CH Population growth rate,1980-1992, ranks"
## [11] "POPDEN Population per square mile, 1992"
## [12] "R_POPDEN Population per square mile, 1992, ranks"
## [13] "OLD % older 65 years"
## [14] "R_OLD % older 65 years, ranks"
## [15] "BLACK Black population,1990"
## [16] "R_BLACK Black population,1990, ranks"
## [17] "BLACK% % Black population,1990"
## [18] "R_BLACK% % Black population,1990, ranks"
## [19] "ASIAN Asian or Pacific Islander population,1990"
## [20] "R_ASIAN Asian or Pacific Islander population,1990, ranks"
## [21] "ASIAN% % Asian or Pacific Islander population,1990"
## [22] "R_ASIAN% % Asian or Pacific Islander population,1990, ranks"
## [23] "HISP Hispanic population, 1990"
## [24] "R_HISP Hispanic population, 1990, ranks"
## [25] "HISP% % Hispanic population, 1990"
## [26] "R_HISP% % Hispanic population, 1990, ranks"
## [27] "BORN_F % persons of foreign born"
## [28] "R_BORN_F % persons of foreign born, ranks"
## [29] "LANG % of persons, speaking language other than English at home, 1990"
## [30] "R_LANG % of persons, speaking language other than English at home, 1990, ranks"
## [31] "HH1 % 1-person households, 1990"
## [32] "R_HH1 % 1-person households, 1990, ranks"
## [33] "FAMIL1 % one-parent family households, 1990"
## [34] "R_FAMIL1 % one-parent family households, 1990, ranks"
## [35] "DEATH Infant death rate per 1000 live births,1988"
## [36] "R_DEATH Infant death rate per 1000 live births,1988, ranks"
## [37] "CRIME Crime rate per 100000 population, 1991"
## [38] "R_CRIME Crime rate per 100000 population, 1991, rate"
## [39] "SCHOOL % of elementary and high school enrollment in public schools, 1990"
## [40] "R_SCHOOL % of elementary and high school enrollment in public schools, 1990, ranks"
## [41] "DEGREE % of adults with a bachelor's degree or higher, 1990"
## [42] "R_DEGREE % of adults with a bachelor's degree or higher, 1990, ranks"
## [43] "INCOME Median household income, 1989"
## [44] "R_INCOME Median household income, 1989, ranks"
## [45] "ASSIST % of households, receiving public assistance,1989"
## [46] "R_ASSIST % of households, receiving public assistance,1989, ranks"
## [47] "POVERT % of persons below poverty level, 1989"
## [48] "R_POVERT % of persons below poverty level, 1989, ranks"
## [49] "OLB_BIL % of housing units built 1939 or earlier, 1990"
## [50] "R_OLD_BI % of housing units built 1939 or earlier, 1990, ranks"
## [51] "OWNER Median value of specified owner-occupied housing units ($), 1990"
## [52] "R_OWNER Median value of specified owner-occupied housing units ($), 1990, ranks"
## [53] "RENTER % renter-occupied housing units,1990"
## [54] "R_RENTER % renter-occupied housing units,1990, ranks"
## [55] "GROSS Median gross rent of specified renter-occupied housing units ($), 1990"
## [56] "R_GROSS Median gross rent of specified renter-occupied housing units ($), 1990, ranks"
## [57] "CONDOM % condominimum occupied housing units, 1990"
## [58] "R_CONDOM % condominimum occupied housing units, 1990, ranks"
## [59] "TRANSP % of workers using public transportation"
## [60] "R_TRANSP % of workers using public transportation, ranks"
## [61] "UNEMP Unemployment rate, 1991"
## [62] "R_UNEMP Unemployment rate, 1991, ranks"
## [63] "LABOR Labor force - % change 1980-1990"
## [64] "R_LABOR Labor force - % change 1980-1990, ranks"
## [65] "LAB_F Female civilian labor force participation rate, 1990"
## [66] "R_LAB_F Female civilian labor force participation rate, 1990, ranks"
## [67] "MANLAB % of civilian labor force emploied in manufacturing, 1990"
## [68] "R_MANLAB % of civilian labor force emploied in manufacturing, 1990, ranks"
## [69] "TAXE City government taxes per capita, 1991"
## [70] "R_TAXE City government taxes per capita, 1991, ranks"
## [71] "TEMPER Average daily temperature in July"
## [72] "R_TEMPER Average daily temperature in July, ranks"
## [73] "PRECEP Annual precipitation"
## [74] "R_PRECEP Annual precipitation"
names_interesting <- c("AREA", "POP80", "POP92", "POPDEN", "CRIME", "BORN_F", "POVERT", "INCOME", "UNEMP", "TEMPER")
data <- data %>% select(all_of(c("CITY", "STATE", names_interesting)))
print(head(data))
## # A tibble: 6 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NEW_… NY 309. 7.07e6 7.31e6 23671 9236 28.4 19.3 29823 8.6 76.8
## 2 LOS_… CA 469. 2.97e6 3.49e6 7436 9730 38.4 18.9 30925 9 74.3
## 3 CHIC… IL 227. 3.01e6 2.77e6 12185 NA 16.9 21.6 26301 8.4 75.1
## 4 HOUS… TX 540. 1.60e6 1.69e6 3131 10824 17.8 20.7 26261 6.1 83.5
## 5 PHIL… PA 135. 1.69e6 1.55e6 11492 6835 6.6 20.3 24603 8 76.7
## 6 SAN_… CA 324 8.76e5 1.15e6 3546 8537 20.9 13.4 10 6.2 71
Город и штат качественные, остальные количественные, ранги были порядковыми.
find_mode_freq <- function(x) {
x <- x[!is.na(x)]
return(max(tabulate(match(x, x))))
}
print(data %>% summarise(across(
all_of(names_interesting),
find_mode_freq
)))
## # A tibble: 1 × 10
## AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 1 2 1 4 2 1 5 3
print(sort(data$UNEMP))
## [1] 2.3 3.2 3.8 3.9 4.1 4.4 4.6 4.7 4.7 4.8 4.8 5.0 5.0 5.0 5.0
## [16] 5.1 5.1 5.1 5.3 5.4 5.4 5.4 5.4 5.4 5.6 5.7 5.9 5.9 5.9 6.0
## [31] 6.0 6.1 6.1 6.1 6.1 6.2 6.2 6.4 6.4 6.5 6.7 6.7 6.7 6.8 6.9
## [46] 6.9 7.0 7.1 7.2 7.2 7.3 7.3 7.3 7.5 7.6 7.7 7.7 7.7 8.0 8.1
## [61] 8.4 8.4 8.5 8.6 9.0 9.0 9.4 9.5 9.6 10.0 10.4 10.6 10.7 11.0 11.9
## [76] 12.6 13.1
Все количественные буду считать непрерывными. возможно UNEMP непрерывный с плохой точностью.
if (interactive()) pdf("ggpairs_unedited.pdf")
ggpairs(
data[, -(1:2)],
lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
diag = list(continuous = "barDiag")
)
if (interactive()) dev.off()
Убираю outliers:
Помечаю некорректные данные в INCOME как NA. Удаляю город из Аляски за плотность населения. Флорида выделсется на BORN_F-INCOME Гаваи выделяются низким уровнем безработицы. странно?
data$INCOME[data$INCOME < 100] <- NA
data <- data %>% filter(STATE != "AK")
Функция, которая логарифмирует, если это сделает выборку симметричнее
log_asymmetric <- function(x) {
if (skewness(x, na.rm = TRUE) < abs(skewness(log(x), na.rm = TRUE))) {
print("default")
return(x)
} else {
print("logged")
return(log(x))
}
}
Автоматически логарифмирую то что имеет асимметрию и длинный хвост справа
data_logged <- data %>%
mutate(across(all_of(names_interesting), log_asymmetric))
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "default"
## [1] "default"
## [1] "logged"
## [1] "default"
# if (interactive()) pdf("ggpairs_logged.pdf")
# ggpairs(
# data_logged[, -(1:2)],
# lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
# diag = list(continuous = "barDiag")
# )
# if (interactive()) dev.off()
выглядит однородно.
print_characteristics <- function(x) {
list(
mean = mean(x, na.rm = TRUE),
var = var(x, na.rm = TRUE),
skewness = skewness(x, na.rm = TRUE),
kurtosis = kurtosis(x, na.rm = TRUE) - 3
)
}
sapply(data_logged %>% select(all_of
(names_interesting)), print_characteristics)
## AREA POP80 POP92 POPDEN CRIME BORN_F
## mean 4.754638 12.9069 13.01226 8.257586 9.206558 1.959747
## var 0.6658984 0.5142076 0.4464288 0.5099096 0.06798296 0.8435754
## skewness 0.1317646 1.453879 1.743773 0.1015321 0.1476689 0.3081197
## kurtosis -0.3907363 3.033026 3.885752 -0.1531417 0.08872069 -0.7431516
## POVERT INCOME UNEMP TEMPER
## mean 18.11842 25282.48 1.873674 77.60132
## var 34.68872 14321837 0.09967048 37.59853
## skewness 0.2243883 -0.2196584 -0.2186434 -0.1426746
## kurtosis -0.3539569 -0.6467894 0.6594639 0.7472063
df <- data_logged %>%
select(-CITY, -STATE) %>%
drop_na()
df_names <- data_logged %>%
drop_na()
ggplot(melt(cor(df))) +
geom_raster(aes(x = Var2, y = Var1, fill = value)) +
geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
scale_fill_gradient2() +
theme_dark() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
model_ <- lm(CRIME ~ ., data = df, na.action = na.exclude)
model <- lm.beta(model_)
summary(model)
##
## Call:
## lm(formula = CRIME ~ ., data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41586 -0.11001 -0.00454 0.11585 0.46619
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 9.347e+00 NA 8.941e-01 10.453 1.11e-14 ***
## AREA -6.914e+01 -2.356e+02 1.949e+02 -0.355 0.7241
## POP80 4.567e-01 1.292e+00 2.379e-01 1.920 0.0601 .
## POP92 6.863e+01 1.850e+02 1.949e+02 0.352 0.7261
## POPDEN -6.926e+01 -2.031e+02 1.948e+02 -0.355 0.7236
## BORN_F 1.232e-01 4.412e-01 4.791e-02 2.572 0.0128 *
## POVERT 1.586e-02 3.506e-01 1.231e-02 1.288 0.2030
## INCOME 6.049e-06 9.536e-02 1.659e-05 0.365 0.7167
## UNEMP 1.267e-01 1.575e-01 1.263e-01 1.003 0.3204
## TEMPER 8.670e-03 2.107e-01 5.543e-03 1.564 0.1235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2107 on 55 degrees of freedom
## Multiple R-squared: 0.3476, Adjusted R-squared: 0.2409
## F-statistic: 3.256 on 9 and 55 DF, p-value: 0.003072
ggplot(melt(cov2cor(vcov(model)))) +
geom_raster(aes(x = Var2, y = Var1, fill = value)) +
geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
scale_fill_gradient2() +
theme_dark() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# cov2cor(vcov(model))
hist(residuals(model))
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.98567, p-value = 0.656
plot(fitted(model), residuals(model))
abline(h = 0, lty = 2)
есть население в 80 и 92 и другие зависимые
outliers уже убраны, если какие-то и остались, это не так критично
На примере пары признаков строите двумерный доверительный интервал для пары значащих коэффициентов регрессии, интерпретируете: (1) оба признака влияют на результат согласно оценкам коэффициентов регрессии перед ними: или (2) признаки вместе сильно влияют, но не различить, какой из них больше; или (3) непонятно, или оба признака слабо влияют, или оба влияют сильно.
plot_ellipse <- function(model_, id) {
# confidenceEllipse(lm.beta(model_), which.coef = id, vcov. = function(x) cov2cor(vcov(x)))
# points(0, 0, col = "red", pch = 19)
# lines(x = c(0, coef(lm.beta(model_))[id[1]]), c(0, coef(lm.beta(model_))[id[2]]), col = "red")
confidenceEllipse(model_, which.coef = id)
points(0, 0, col = "red", pch = 19)
lines(x = c(0, coef(model_)[id[1]]), c(0, coef(model_)[id[2]]), col = "red")
# if (id[1] != 1 && id[2] != 1) {
# confidenceEllipse(lm.beta(model_), which.coef = id, vcov. = function(x) cov2cor(vcov(x)))
# points(0, 0, col = "red", pch = 19)
# lines(x = c(0, coef(lm.beta(model_))[id[1]]), c(0, coef(lm.beta(model_))[id[2]]), col = "red")
# }
}
plot_ellipse(model_, c(3, 6))
plot_ellipse(model_, c(3, 7))
# plot_ellipse(model, c(3, 8))
# plot_ellipse(model, c(3, 9))
# plot_ellipse(model, c(3, 10))
# plot_ellipse(model_, c(4, 5))
# plot_ellipse(model_, c(6, 7))
Сделаем это вручную на основе таблицы Redundancy. Там «независимые переменные» сравниваются по двум критериям - независимость от других «независимых» признаков и зависимость от зависимой переменной. Объясняете, что означают столбцы, что делать, если эти критерии дают противоречивые рекомендации, решаете, какой признак лучше убрать первым.
# ols_vif_tol(model_)
# ols_correlations(model_)
model_manual <- lm(CRIME ~ ., data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ ., data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41586 -0.11001 -0.00454 0.11585 0.46619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.347e+00 8.941e-01 10.453 1.11e-14 ***
## AREA -6.914e+01 1.949e+02 -0.355 0.7241
## POP80 4.567e-01 2.379e-01 1.920 0.0601 .
## POP92 6.863e+01 1.949e+02 0.352 0.7261
## POPDEN -6.926e+01 1.948e+02 -0.355 0.7236
## BORN_F 1.232e-01 4.791e-02 2.572 0.0128 *
## POVERT 1.586e-02 1.231e-02 1.288 0.2030
## INCOME 6.049e-06 1.659e-05 0.365 0.7167
## UNEMP 1.267e-01 1.263e-01 1.003 0.3204
## TEMPER 8.670e-03 5.543e-03 1.564 0.1235
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2107 on 55 degrees of freedom
## Multiple R-squared: 0.3476, Adjusted R-squared: 0.2409
## F-statistic: 3.256 on 9 and 55 DF, p-value: 0.003072
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 AREA 2.689858e-08 3.717668e+07
## 2 POP80 2.618028e-02 3.819668e+01
## 3 POP92 4.298257e-08 2.326524e+07
## 4 POPDEN 3.633518e-08 2.752154e+07
## 5 BORN_F 4.032272e-01 2.479992e+00
## 6 POVERT 1.601332e-01 6.244802e+00
## 7 INCOME 1.735122e-01 5.763285e+00
## 8 UNEMP 4.810074e-01 2.078970e+00
## 9 TEMPER 6.540152e-01 1.529016e+00
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## AREA -0.032 -0.048 -0.039
## POP80 0.028 0.251 0.209
## POP92 -0.015 0.047 0.038
## POPDEN 0.023 -0.048 -0.039
## BORN_F 0.200 0.328 0.280
## POVERT 0.411 0.171 0.140
## INCOME -0.299 0.049 0.040
## UNEMP 0.369 0.134 0.109
## TEMPER 0.162 0.206 0.170
## -------------------------------------------
AIC(model_manual)
## [1] -6.855636
Удаляю AREA из-за VIF и partial
model_manual <- lm(CRIME ~ POP80 + POP92 + POPDEN + INCOME + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POP80 + POP92 + POPDEN + INCOME + BORN_F +
## POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4236 -0.1177 -0.0064 0.1200 0.4580
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.434e+00 8.525e-01 11.067 1.02e-15 ***
## POP80 4.704e-01 2.329e-01 2.020 0.0482 *
## POP92 -5.210e-01 2.494e-01 -2.089 0.0413 *
## POPDEN -1.319e-01 5.602e-02 -2.354 0.0221 *
## INCOME 6.354e-06 1.644e-05 0.387 0.7005
## BORN_F 1.250e-01 4.729e-02 2.643 0.0106 *
## POVERT 1.573e-02 1.221e-02 1.288 0.2029
## UNEMP 1.327e-01 1.242e-01 1.068 0.2899
## TEMPER 8.346e-03 5.424e-03 1.539 0.1295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.209 on 56 degrees of freedom
## Multiple R-squared: 0.3461, Adjusted R-squared: 0.2527
## F-statistic: 3.705 on 8 and 56 DF, p-value: 0.001553
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP80 0.02688552 37.194742
## 2 POP92 0.02584005 38.699621
## 3 POPDEN 0.43265704 2.311299
## 4 INCOME 0.17397908 5.747817
## 5 BORN_F 0.40744566 2.454315
## 6 POVERT 0.16027769 6.239172
## 7 UNEMP 0.48984345 2.041469
## 8 TEMPER 0.67225320 1.487535
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP80 0.028 0.261 0.218
## POP92 -0.015 -0.269 -0.226
## POPDEN 0.023 -0.300 -0.254
## INCOME -0.299 0.052 0.042
## BORN_F 0.200 0.333 0.286
## POVERT 0.411 0.170 0.139
## UNEMP 0.369 0.141 0.115
## TEMPER 0.162 0.201 0.166
## -------------------------------------------
AIC(model_manual)
## [1] -8.707016
Удаляю POP80 из-за VIF
model_manual <- lm(CRIME ~ POP92 + POPDEN + INCOME + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POP92 + POPDEN + INCOME + BORN_F + POVERT +
## UNEMP + TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45095 -0.12197 -0.02222 0.12243 0.48281
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.325e+00 8.735e-01 10.677 3.23e-15 ***
## POP92 -2.578e-02 4.679e-02 -0.551 0.5837
## POPDEN -9.384e-02 5.416e-02 -1.733 0.0886 .
## INCOME -2.185e-06 1.631e-05 -0.134 0.8938
## BORN_F 7.990e-02 4.280e-02 1.867 0.0671 .
## POVERT 1.682e-02 1.252e-02 1.343 0.1845
## UNEMP 1.441e-01 1.274e-01 1.132 0.2624
## TEMPER 4.401e-03 5.195e-03 0.847 0.4005
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2146 on 57 degrees of freedom
## Multiple R-squared: 0.2985, Adjusted R-squared: 0.2124
## F-statistic: 3.465 on 7 and 57 DF, p-value: 0.003686
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP92 0.7738140 1.292300
## 2 POPDEN 0.4878151 2.049957
## 3 INCOME 0.1863123 5.367333
## 4 BORN_F 0.5242427 1.907513
## 5 POVERT 0.1605911 6.226996
## 6 UNEMP 0.4908708 2.037196
## 7 TEMPER 0.7724648 1.294557
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP92 -0.015 -0.073 -0.061
## POPDEN 0.023 -0.224 -0.192
## INCOME -0.299 -0.018 -0.015
## BORN_F 0.200 0.240 0.207
## POVERT 0.411 0.175 0.149
## UNEMP 0.369 0.148 0.126
## TEMPER 0.162 0.112 0.094
## -------------------------------------------
AIC(model_manual)
## [1] -6.137272
Удаляю INCOME из-за VIF и Partial
model_manual <- lm(CRIME ~ POP92 + POPDEN + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POP92 + POPDEN + BORN_F + POVERT + UNEMP +
## TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4482 -0.1272 -0.0226 0.1197 0.4762
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.278364 0.793016 11.700 < 2e-16 ***
## POP92 -0.027764 0.044014 -0.631 0.53065
## POPDEN -0.093971 0.053691 -1.750 0.08537 .
## BORN_F 0.077684 0.039156 1.984 0.05200 .
## POVERT 0.018241 0.006621 2.755 0.00783 **
## UNEMP 0.141187 0.124362 1.135 0.26092
## TEMPER 0.004414 0.005150 0.857 0.39493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2128 on 58 degrees of freedom
## Multiple R-squared: 0.2983, Adjusted R-squared: 0.2257
## F-statistic: 4.109 on 6 and 58 DF, p-value: 0.001672
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP92 0.8595873 1.163349
## 2 POPDEN 0.4879834 2.049250
## 3 BORN_F 0.6158375 1.623805
## 4 POVERT 0.5648207 1.770473
## 5 UNEMP 0.5060650 1.976031
## 6 TEMPER 0.7727314 1.294111
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP92 -0.015 -0.083 -0.069
## POPDEN 0.023 -0.224 -0.193
## BORN_F 0.200 0.252 0.218
## POVERT 0.411 0.340 0.303
## UNEMP 0.369 0.147 0.125
## TEMPER 0.162 0.112 0.094
## -------------------------------------------
AIC(model_manual)
## [1] -8.116788
Удаляю POP92 из-за Partial
model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT + UNEMP + TEMPER,
## data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44159 -0.12102 -0.04726 0.12545 0.47496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.001920 0.657539 13.690 < 2e-16 ***
## POPDEN -0.100061 0.052545 -1.904 0.06175 .
## BORN_F 0.072774 0.038178 1.906 0.06150 .
## POVERT 0.018301 0.006586 2.779 0.00731 **
## UNEMP 0.144104 0.123640 1.166 0.24850
## TEMPER 0.004003 0.005082 0.788 0.43410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2117 on 59 degrees of freedom
## Multiple R-squared: 0.2935, Adjusted R-squared: 0.2336
## F-statistic: 4.901 on 5 and 59 DF, p-value: 0.0008112
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POPDEN 0.5042890 1.982990
## 2 BORN_F 0.6411805 1.559623
## 3 POVERT 0.5649394 1.770101
## 4 UNEMP 0.5067659 1.973298
## 5 TEMPER 0.7853115 1.273380
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POPDEN 0.023 -0.241 -0.208
## BORN_F 0.200 0.241 0.209
## POVERT 0.411 0.340 0.304
## UNEMP 0.369 0.150 0.128
## TEMPER 0.162 0.102 0.086
## -------------------------------------------
AIC(model_manual)
## [1] -9.672381
Удаляю TEMPER из-за Partial
model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT + UNEMP, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT + UNEMP, data = df,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43465 -0.11715 -0.04416 0.13301 0.47304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.446971 0.335126 28.189 < 2e-16 ***
## POPDEN -0.118801 0.046700 -2.544 0.01356 *
## BORN_F 0.079726 0.037025 2.153 0.03533 *
## POVERT 0.018828 0.006532 2.883 0.00547 **
## UNEMP 0.143795 0.123247 1.167 0.24794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.211 on 60 degrees of freedom
## Multiple R-squared: 0.286, Adjusted R-squared: 0.2384
## F-statistic: 6.01 on 4 and 60 DF, p-value: 0.0003905
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POPDEN 0.6343884 1.576321
## 2 BORN_F 0.6773993 1.476234
## 3 POVERT 0.5708139 1.751885
## 4 UNEMP 0.5067710 1.973278
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POPDEN 0.023 -0.312 -0.278
## BORN_F 0.200 0.268 0.235
## POVERT 0.411 0.349 0.314
## UNEMP 0.369 0.149 0.127
## -------------------------------------------
AIC(model_manual)
## [1] -10.99261
Удаляю UNEMP из-за Partial
model_manual <- lm(CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4113 -0.1241 -0.0199 0.1491 0.4475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.536704 0.327146 29.151 < 2e-16 ***
## POPDEN -0.109661 0.046174 -2.375 0.0207 *
## BORN_F 0.093070 0.035319 2.635 0.0106 *
## POVERT 0.023184 0.005375 4.313 5.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2116 on 61 degrees of freedom
## Multiple R-squared: 0.2698, Adjusted R-squared: 0.2339
## F-statistic: 7.514 on 3 and 61 DF, p-value: 0.000233
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POPDEN 0.6527595 1.531958
## 2 BORN_F 0.7488569 1.335369
## 3 POVERT 0.8479393 1.179330
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POPDEN 0.023 -0.291 -0.260
## BORN_F 0.200 0.320 0.288
## POVERT 0.411 0.483 0.472
## -------------------------------------------
AIC(model_manual)
## [1] -11.53442
все
model.start <- lm(CRIME ~ 1, data = df, na.action = na.exclude)
model.end <- model_
stepAIC(model.start, direction = "forward", scope = list(lower = model.start, upper = model.end))
## Start: AIC=-183.55
## CRIME ~ 1
##
## Df Sum of Sq RSS AIC
## + POVERT 1 0.63194 3.1104 -193.58
## + UNEMP 1 0.50967 3.2326 -191.07
## + INCOME 1 0.33444 3.4079 -187.64
## + BORN_F 1 0.15043 3.5919 -184.22
## <none> 3.7423 -183.55
## + TEMPER 1 0.09834 3.6440 -183.28
## + AREA 1 0.00390 3.7384 -181.62
## + POP80 1 0.00296 3.7394 -181.61
## + POPDEN 1 0.00206 3.7403 -181.59
## + POP92 1 0.00087 3.7414 -181.57
##
## Step: AIC=-193.58
## CRIME ~ POVERT
##
## Df Sum of Sq RSS AIC
## + TEMPER 1 0.140358 2.9700 -194.58
## + BORN_F 1 0.125238 2.9851 -194.25
## <none> 3.1104 -193.58
## + UNEMP 1 0.085932 3.0244 -193.40
## + POPDEN 1 0.066834 3.0435 -192.99
## + AREA 1 0.027661 3.0827 -192.16
## + INCOME 1 0.019274 3.0911 -191.98
## + POP80 1 0.007771 3.1026 -191.74
## + POP92 1 0.003654 3.1067 -191.65
##
## Step: AIC=-194.58
## CRIME ~ POVERT + TEMPER
##
## Df Sum of Sq RSS AIC
## + BORN_F 1 0.127679 2.8423 -195.43
## + UNEMP 1 0.104053 2.8660 -194.90
## <none> 2.9700 -194.58
## + INCOME 1 0.022169 2.9478 -193.06
## + POPDEN 1 0.013926 2.9561 -192.88
## + POP92 1 0.005962 2.9641 -192.71
## + POP80 1 0.004451 2.9656 -192.68
## + AREA 1 0.000834 2.9692 -192.60
##
## Step: AIC=-195.43
## CRIME ~ POVERT + TEMPER + BORN_F
##
## Df Sum of Sq RSS AIC
## + POPDEN 1 0.137392 2.7050 -196.66
## <none> 2.8423 -195.43
## + POP92 1 0.042609 2.7997 -194.42
## + UNEMP 1 0.035760 2.8066 -194.26
## + POP80 1 0.022474 2.8199 -193.95
## + AREA 1 0.007277 2.8351 -193.60
## + INCOME 1 0.004518 2.8378 -193.54
##
## Step: AIC=-196.65
## CRIME ~ POVERT + TEMPER + BORN_F + POPDEN
##
## Df Sum of Sq RSS AIC
## <none> 2.7050 -196.66
## + UNEMP 1 0.060878 2.6441 -196.13
## + AREA 1 0.020548 2.6844 -195.15
## + POP92 1 0.020537 2.6844 -195.15
## + POP80 1 0.004259 2.7007 -194.76
## + INCOME 1 0.000961 2.7040 -194.68
##
## Call:
## lm(formula = CRIME ~ POVERT + TEMPER + BORN_F + POPDEN, data = df,
## na.action = na.exclude)
##
## Coefficients:
## (Intercept) POVERT TEMPER BORN_F POPDEN
## 9.093936 0.022670 0.003984 0.086179 -0.090989
stepAIC(model_, direction = "backward")
## Start: AIC=-193.32
## CRIME ~ AREA + POP80 + POP92 + POPDEN + BORN_F + POVERT + INCOME +
## UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - POP92 1 0.005505 2.4469 -195.17
## - AREA 1 0.005589 2.4470 -195.17
## - POPDEN 1 0.005610 2.4470 -195.17
## - INCOME 1 0.005904 2.4473 -195.16
## - UNEMP 1 0.044635 2.4861 -194.14
## - POVERT 1 0.073682 2.5151 -193.38
## <none> 2.4414 -193.32
## - TEMPER 1 0.108623 2.5500 -192.49
## - POP80 1 0.163609 2.6050 -191.10
## - BORN_F 1 0.293714 2.7351 -187.93
##
## Step: AIC=-195.17
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + INCOME + UNEMP +
## TEMPER
##
## Df Sum of Sq RSS AIC
## - INCOME 1 0.006532 2.4535 -197.00
## - UNEMP 1 0.049840 2.4968 -195.86
## - POVERT 1 0.072543 2.5195 -195.27
## <none> 2.4469 -195.17
## - TEMPER 1 0.103532 2.5505 -194.48
## - POP80 1 0.178301 2.6252 -192.60
## - AREA 1 0.190764 2.6377 -192.29
## - POPDEN 1 0.252540 2.6995 -190.79
## - BORN_F 1 0.305201 2.7521 -189.53
##
## Step: AIC=-197
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - UNEMP 1 0.05839 2.5118 -197.47
## <none> 2.4535 -197.00
## - TEMPER 1 0.09910 2.5526 -196.42
## - POVERT 1 0.11872 2.5722 -195.93
## - POP80 1 0.17260 2.6261 -194.58
## - AREA 1 0.18745 2.6409 -194.21
## - POPDEN 1 0.25177 2.7052 -192.65
## - BORN_F 1 0.33852 2.7920 -190.60
##
## Step: AIC=-197.47
## CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + TEMPER
##
## Df Sum of Sq RSS AIC
## <none> 2.5118 -197.47
## - TEMPER 1 0.09932 2.6112 -196.95
## - POP80 1 0.17256 2.6844 -195.15
## - AREA 1 0.18885 2.7007 -194.76
## - POPDEN 1 0.24590 2.7577 -193.40
## - POVERT 1 0.29614 2.8080 -192.22
## - BORN_F 1 0.44075 2.9526 -188.96
##
## Call:
## lm(formula = CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT +
## TEMPER, data = df, na.action = na.exclude)
##
## Coefficients:
## (Intercept) AREA POP80 POPDEN BORN_F POVERT
## 9.664651 -0.493029 0.447200 -0.613315 0.142206 0.016202
## TEMPER
## 0.008129
stepAIC(model_manual, direction = "both", scope = list(lower = model.start, upper = model.end))
## Start: AIC=-198
## CRIME ~ POPDEN + BORN_F + POVERT
##
## Df Sum of Sq RSS AIC
## <none> 2.7325 -198.00
## + UNEMP 1 0.06062 2.6719 -197.46
## + TEMPER 1 0.02754 2.7049 -196.66
## + AREA 1 0.01468 2.7178 -196.35
## + POP92 1 0.01468 2.7178 -196.35
## + POP80 1 0.00309 2.7294 -196.07
## + INCOME 1 0.00075 2.7317 -196.01
## - POPDEN 1 0.25265 2.9851 -194.25
## - BORN_F 1 0.31106 3.0435 -192.99
## - POVERT 1 0.83344 3.5659 -182.69
##
## Call:
## lm(formula = CRIME ~ POPDEN + BORN_F + POVERT, data = df, na.action = na.exclude)
##
## Coefficients:
## (Intercept) POPDEN BORN_F POVERT
## 9.53670 -0.10966 0.09307 0.02318
modelAIC_ <- lm(CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT + TEMPER, data = df, na.action = na.exclude)
modelAIC <- lm.beta(modelAIC_)
summary(modelAIC)
##
## Call:
## lm(formula = CRIME ~ AREA + POP80 + POPDEN + BORN_F + POVERT +
## TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41000 -0.11529 0.00209 0.11855 0.44871
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 9.664651 NA 0.782360 12.353 < 2e-16 ***
## AREA -0.493029 -1.680253 0.236098 -2.088 0.04118 *
## POP80 0.447200 1.265294 0.224031 1.996 0.05062 .
## POPDEN -0.613315 -1.798657 0.257385 -2.383 0.02048 *
## BORN_F 0.142206 0.509047 0.044576 3.190 0.00229 **
## POVERT 0.016202 0.358145 0.006196 2.615 0.01135 *
## TEMPER 0.008129 0.197515 0.005368 1.514 0.13536
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2081 on 58 degrees of freedom
## Multiple R-squared: 0.3288, Adjusted R-squared: 0.2594
## F-statistic: 4.735 on 6 and 58 DF, p-value: 0.000548
AIC(modelAIC)
## [1] -11.00726
plot_ellipse(model_manual, c(1, 2))
plot_ellipse(model_manual, c(1, 3))
plot_ellipse(model_manual, c(1, 4))
plot_ellipse(model_manual, c(2, 3))
plot_ellipse(model_manual, c(2, 4))
plot_ellipse(model_manual, c(3, 4))
plot_ellipse(modelAIC_, c(2, 3))
plot_ellipse(modelAIC_, c(2, 4))
plot_ellipse(modelAIC_, c(2, 7))
plot_ellipse(modelAIC_, c(3, 4))
plot_ellipse(modelAIC_, c(3, 7))
plot_ellipse(modelAIC_, c(4, 5))
# plot_ellipse(modelAIC_, c(1, 2))
# plot_ellipse(modelAIC_, c(1, 3))
# plot_ellipse(modelAIC_, c(1, 4))
# plot_ellipse(modelAIC_, c(1, 5))
# plot_ellipse(modelAIC_, c(1, 6))
# plot_ellipse(modelAIC_, c(1, 7))
# plot_ellipse(modelAIC_, c(2, 5))
# plot_ellipse(modelAIC_, c(2, 6))
# plot_ellipse(modelAIC_, c(3, 5))
# plot_ellipse(modelAIC_, c(3, 6))
# plot_ellipse(modelAIC_, c(4, 6))
# plot_ellipse(modelAIC_, c(4, 7))
# plot_ellipse(modelAIC_, c(5, 6))
# plot_ellipse(modelAIC_, c(5, 7))
# plot_ellipse(modelAIC_, c(6, 7))
# hist(residuals(modelAIC_))
# shapiro.test(residuals(modelAIC_))
# ols_plot_resid_stud_fit(modelAIC_)
# # print(df_names[c(15, 27, 38, 56, 52), ])
# ols_plot_resid_stud(modelAIC_)
# # print(df_names[56, ])
hist(residuals(model_manual))
shapiro.test(residuals(model_manual))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_manual)
## W = 0.97945, p-value = 0.3524
ols_plot_resid_stud_fit(model_manual)
# print(df_names[c(15, 27, 38, 56, 52), ])
ols_plot_resid_stud(model_manual)
# print(df_names[56, ])
model_manual_ <- model_manual
model_manual <- lm.beta(model_manual)
# ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres)) +
# geom_point() +
# geom_abline(slope = 1, intercept = 0, color = "red")
ggplot(data_frame(residuals = rstandard(model_manual), studres = studres(model_manual)), aes(x = residuals, y = studres)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
# ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres - residuals)) +
# geom_point() +
# geom_abline(slope = 0, intercept = 0, color = "red")
новые графики, в том числе, график с рычагами - остатками.
plot(model_manual)
plot(sort(mahalanobis_distance(df)$mahal.dist, decreasing = TRUE))
# plot(cooks.distance(modelAIC))
# df_names[which(cooks.distance(modelAIC) > 0.06), ]
plot(sort(cooks.distance(model_manual), decreasing = TRUE))
df_names[which(cooks.distance(model_manual) > 0.1), ]
## # A tibble: 1 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EL_PASO TX 5.50 13.0 13.2 7.70 9.17 3.15 25.3 23460 2.36 82.3
тут прогноз того что оказалось аутлаером
new <- data_logged %>%
filter(CITY == "EL_PASO") %>%
select(POPDEN, BORN_F, POVERT)
Попытка прогнозировать преступность в Петербуре
predict.lm(model_manual_, new)
## 1
## 9.571922
exp(predict.lm(model_manual_, data.frame(POPDEN = log(13342), BORN_F = log(5.3), POVERT = 6.6), interval = "prediction"))
## fit lwr upr
## 1 6656.181 4127.882 10733.04
exp(predict.lm(model_manual_, data.frame(POPDEN = log(13342), BORN_F = log(5.3), POVERT = 6.6), interval = "confidence"))
## fit lwr upr
## 1 6656.181 5332.49 8308.453
преступность за 2022 год в СПб на 100000 чел.
print(62971 / (7500000 / 100000))
## [1] 839.6133
Плохо согласуется потому что модель построена на американских городах по данным прошлого века. Данные собирались по разному, законы разные…